{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [],
   "source": [
    "import jieba\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [],
   "source": [
    "# Input files under data/: train.csv, test_new.csv and sample.csv.\n",
    "# train.csv is parsed as tab-separated; the other two use read_csv's default separator.\n",
    "train = pd.read_csv('data/train.csv', sep='\\t')\n",
    "test = pd.read_csv('data/test_new.csv')\n",
    "sub = pd.read_csv('data/sample.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(10000, 2) (2000, 2)\n"
     ]
    }
   ],
   "source": [
    "# Print the (rows, columns) shape of the train and test frames.\n",
    "print(train.shape, test.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 统计评论句子长度"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "pycharm": {
     "is_executing": false
    },
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Give training rows a positional id and mark test rows with the placeholder label -1,\n",
    "# then stack both frames so the length statistics cover every comment.\n",
    "# concat keeps each frame's own row labels, so index values repeat across train and test.\n",
    "train['id'] = list(range(len(train)))\n",
    "test['label'] = -1\n",
    "df = pd.concat([train, test], sort=False)\n",
    "\n",
    "\n",
    "def token(text):\n",
    "    \"\"\"\n",
    "    Segment text into words with jieba.\n",
    "\n",
    "    :param text: raw text to segment\n",
    "    :return: the segmented pieces joined by single spaces\n",
    "    \"\"\"\n",
    "    return \" \".join(jieba.cut(text))\n",
    "\n",
    "\n",
    "# Space-joined segmentation of each comment, and its number of whitespace-separated tokens.\n",
    "df['token_text'] = df['comment'].apply(token)\n",
    "df['token_text_len'] = df['token_text'].str.split().str.len()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>token_text_len</th>\n",
       "      <th>comment_len</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>12000.000000</td>\n",
       "      <td>12000.000000</td>\n",
       "      <td>12000.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>-0.040750</td>\n",
       "      <td>22.076083</td>\n",
       "      <td>33.944667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>0.539395</td>\n",
       "      <td>21.415742</td>\n",
       "      <td>32.964640</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>-1.000000</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>5.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>10.000000</td>\n",
       "      <td>16.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>14.000000</td>\n",
       "      <td>22.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>25.000000</td>\n",
       "      <td>38.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>179.000000</td>\n",
       "      <td>255.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              label  token_text_len   comment_len\n",
       "count  12000.000000    12000.000000  12000.000000\n",
       "mean      -0.040750       22.076083     33.944667\n",
       "std        0.539395       21.415742     32.964640\n",
       "min       -1.000000        2.000000      5.000000\n",
       "25%        0.000000       10.000000     16.000000\n",
       "50%        0.000000       14.000000     22.000000\n",
       "75%        0.000000       25.000000     38.000000\n",
       "max        1.000000      179.000000    255.000000"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Character count of each raw comment, then summary statistics of the numeric columns.\n",
    "df['comment_len'] = df['comment'].map(len)\n",
    "df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>comment</th>\n",
       "      <th>id</th>\n",
       "      <th>comment_len</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1656</th>\n",
       "      <td>0</td>\n",
       "      <td>老顾客了，性价比超高。                       夏天生意也一样好，是因为...</td>\n",
       "      <td>1656</td>\n",
       "      <td>255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8611</th>\n",
       "      <td>0</td>\n",
       "      <td>串串我吃得多了，但是这家“嘿火”必须要重点给大家推荐一下，首先从此家店的服务说起，店虽然不大...</td>\n",
       "      <td>8611</td>\n",
       "      <td>255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3565</th>\n",
       "      <td>1</td>\n",
       "      <td>第一次买，首先说配送员，打了三个电话，找不到路，你找不到路是我的错？地址写得清清楚楚你给我说...</td>\n",
       "      <td>3565</td>\n",
       "      <td>255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6422</th>\n",
       "      <td>0</td>\n",
       "      <td>#酸菜肉丝米线二两#    简单说一下我的感受: 第一送外卖的叔叔把米线送到南北校区马路那里...</td>\n",
       "      <td>6422</td>\n",
       "      <td>255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>759</th>\n",
       "      <td>0</td>\n",
       "      <td>花了58元，也就是三顿饭的钱，买了虾仁酥和皇帝鱼。感觉这58元，完全就是浪费了。虾仁酥其实就...</td>\n",
       "      <td>759</td>\n",
       "      <td>255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6350</th>\n",
       "      <td>1</td>\n",
       "      <td>在他们家买了两瓶可口可乐，快喝完一瓶才发现，生产日期是2016年10月24日，保质期九个月，...</td>\n",
       "      <td>6350</td>\n",
       "      <td>255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4948</th>\n",
       "      <td>0</td>\n",
       "      <td>服务、菜品、分量、环境都非常的好。首先是服务态度，老板和服务员人特别好的好，给我们介绍餐厅，...</td>\n",
       "      <td>4948</td>\n",
       "      <td>255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2965</th>\n",
       "      <td>1</td>\n",
       "      <td>哎 看了老板的介绍 说是不好做 也看到了评论中的一些问题…我以为老板是可以信赖的人 所以还是...</td>\n",
       "      <td>2965</td>\n",
       "      <td>255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>346</th>\n",
       "      <td>-1</td>\n",
       "      <td>我从来没吃过这么奇怪又难吃的烧烤！！点了个玉米，点了点素菜，其中包括青椒，结果商家直接给我送...</td>\n",
       "      <td>2d047743-d08e-4eb7-b073-cf5b8f2d10ba</td>\n",
       "      <td>255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5113</th>\n",
       "      <td>0</td>\n",
       "      <td>本来想去吃小龙虾来着，无意间看见这家烤肉店，看了评论蛮好，又看见照片评论这么多肉，忍不住立马...</td>\n",
       "      <td>5113</td>\n",
       "      <td>255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5178</th>\n",
       "      <td>1</td>\n",
       "      <td>送来卤鸡蛋是臭的。联系老板，老板说再送一个过来，或者是退我两块钱。这个怎么退钱嘛，我就让她换...</td>\n",
       "      <td>5178</td>\n",
       "      <td>255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5334</th>\n",
       "      <td>0</td>\n",
       "      <td>首先羊肉味道不错，个人觉得偏辣，所以第二只马上通知厨师别放这么多辣椒，就觉得非常合适，第一只...</td>\n",
       "      <td>5334</td>\n",
       "      <td>255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3655</th>\n",
       "      <td>0</td>\n",
       "      <td>服务不怎么好，我们加了十块钱想把鲜排改为腊排，但是都动了筷子了才发现给我们换成了辣猪蹄，叫来...</td>\n",
       "      <td>3655</td>\n",
       "      <td>255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>156</th>\n",
       "      <td>-1</td>\n",
       "      <td>真的要认真点评下。因为想吃日料，和老公在两家不同的店点了餐，想考察下住家附近有哪家店的好吃。...</td>\n",
       "      <td>15f5142b-da2b-4b14-b593-e3eff088a453</td>\n",
       "      <td>255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5384</th>\n",
       "      <td>1</td>\n",
       "      <td>先说说牛肉吧，被我从来不吃泰国菜的老公吃完了，因为跟中国的回锅肉味道一样一样的，虽然吃起来像...</td>\n",
       "      <td>5384</td>\n",
       "      <td>255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2809</th>\n",
       "      <td>1</td>\n",
       "      <td>首先我想说我是你们家老顾客了。最起码这几天都是吃你们家饭菜。但是今天真的很让我失望。我中午点...</td>\n",
       "      <td>2809</td>\n",
       "      <td>255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2829</th>\n",
       "      <td>0</td>\n",
       "      <td>提前半小时就打电话预订了，于是到了餐馆老板就直接给我端上来了。十分的方便。所以要过去的朋友们...</td>\n",
       "      <td>2829</td>\n",
       "      <td>255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1520</th>\n",
       "      <td>1</td>\n",
       "      <td>我们是老顾客 老板心知肚明吧 。第一次吃到饭里有老鼠*、你说是外卖骑手故意弄的，我们理解了并...</td>\n",
       "      <td>1520</td>\n",
       "      <td>255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7302</th>\n",
       "      <td>0</td>\n",
       "      <td>位置很好找，过了附近的路口很清楚的就可以看见牌匾，味道太棒了，红油锅非常好吃，不知道哪里透露...</td>\n",
       "      <td>7302</td>\n",
       "      <td>255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5598</th>\n",
       "      <td>0</td>\n",
       "      <td>来了很多次了 很棒的一家店 #寿司# #海虾芒果卷# #肥牛卷# #日式料理# #炸虾天妇罗...</td>\n",
       "      <td>5598</td>\n",
       "      <td>255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1101</th>\n",
       "      <td>0</td>\n",
       "      <td>特意飞来重庆就为吃一顿正宗的重庆火锅，吃完后总体感觉还是不错的，食材都很新鲜，味道也棒棒哒。...</td>\n",
       "      <td>1101</td>\n",
       "      <td>255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3019</th>\n",
       "      <td>1</td>\n",
       "      <td>请各位朋友看过来，我从来没发过评论，但今天真是忍不住了，请各位千万不要再点这个饭，送餐费收了...</td>\n",
       "      <td>3019</td>\n",
       "      <td>255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>124</th>\n",
       "      <td>0</td>\n",
       "      <td>还可以，糊辣壳很有特色，味道也很好，唯一美中不足的是面条，我比较喜欢吃面，南川的面店基本上都...</td>\n",
       "      <td>124</td>\n",
       "      <td>255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>286</th>\n",
       "      <td>-1</td>\n",
       "      <td>1环境；老板贴心在地址里面说得很清楚，走到里面二号厅电梯（排队进电梯……人太多了……）上楼即...</td>\n",
       "      <td>2577b3b3-0fe5-43bf-926a-4c520cbe28cf</td>\n",
       "      <td>255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>170</th>\n",
       "      <td>1</td>\n",
       "      <td>这家店原来这么不卫生，吃到3盒饭都是小强腿，苍蝇，可惜忘了拍，最郁闷的是整只小强，吃得我们吐...</td>\n",
       "      <td>170</td>\n",
       "      <td>255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8440</th>\n",
       "      <td>0</td>\n",
       "      <td>#榴莲披萨#吃过的最难吃的披萨没有之一，如果不会做披萨可以不用买这个，就单纯卖薯条炸鸡就够了...</td>\n",
       "      <td>8440</td>\n",
       "      <td>255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2940</th>\n",
       "      <td>0</td>\n",
       "      <td>位置：位于渝中区石油路时代天街A馆3号门对面支路内80米（PS:这是官方说法哈~）实际上 官...</td>\n",
       "      <td>2940</td>\n",
       "      <td>255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9703</th>\n",
       "      <td>0</td>\n",
       "      <td>这样的商家真是第一次见到，订了那么多次外卖，真是够奇葩，顾客不是人人都那么细心，我没看见饭要...</td>\n",
       "      <td>9703</td>\n",
       "      <td>255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7597</th>\n",
       "      <td>0</td>\n",
       "      <td>团购的服务员对我们也很热情并没有因为是团购就不管我们，这里的服务员非常的有礼貌，并且还有自助...</td>\n",
       "      <td>7597</td>\n",
       "      <td>255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1251</th>\n",
       "      <td>0</td>\n",
       "      <td>经常团购美食，吃后很少发帖，刚刚吃了这家的铁板烧，必须赞一个。一是货真价实：平底铁锅上桌，服...</td>\n",
       "      <td>1251</td>\n",
       "      <td>255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1700</th>\n",
       "      <td>-1</td>\n",
       "      <td>这是回锅肉？请告诉我这个回锅肉？看着都没食欲了，那个送外卖的是个傻子吧，说了那么明显的路线都...</td>\n",
       "      <td>d9261d7a-b76a-4f6a-9382-6ec9fc7fbe85</td>\n",
       "      <td>83</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5547</th>\n",
       "      <td>0</td>\n",
       "      <td>今天和闺蜜一起过七夕，我们下午在这里呆了三个小时左右，很开心，这里的猫咪超级萌萌哒，抱着它们...</td>\n",
       "      <td>5547</td>\n",
       "      <td>83</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1328</th>\n",
       "      <td>-1</td>\n",
       "      <td>比以前订的虾差得太远太远了，总共二十来个虾，烂虾、死虾多，接近一半差虾钳，更甚者还有几个虾头...</td>\n",
       "      <td>ab09f551-e835-4971-a6d8-319c4bebdc40</td>\n",
       "      <td>83</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1700</th>\n",
       "      <td>0</td>\n",
       "      <td>超级棒的一家酒吧，环境跟服务都非常到位，驻唱歌手水准很高！设备也很给力，音乐节奏感十足。气氛...</td>\n",
       "      <td>1700</td>\n",
       "      <td>83</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9531</th>\n",
       "      <td>0</td>\n",
       "      <td>预定配送时间50分钟以后才到货，中间催了店家3次才收到，下楼时骑手还不在，把饭随便一摆人就不...</td>\n",
       "      <td>9531</td>\n",
       "      <td>83</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7982</th>\n",
       "      <td>0</td>\n",
       "      <td>请客团了3桌。菜上得快。加了2个菜，加上饮料啤酒，差不多加了200多块一桌。总体价格还是可以...</td>\n",
       "      <td>7982</td>\n",
       "      <td>83</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6761</th>\n",
       "      <td>0</td>\n",
       "      <td>好喜欢的，老板娘人很nice.手把手的教我们怎么做，哪里还可以做饼干之内的小吃，小孩子吵着没...</td>\n",
       "      <td>6761</td>\n",
       "      <td>83</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1698</th>\n",
       "      <td>-1</td>\n",
       "      <td>吃了以后拉了三天肚子了，在医院治疗肠胃炎输水花了800多，肚子疼死了，晚上都睡不好，吃什么吐...</td>\n",
       "      <td>d8949880-3577-4956-82d6-9798076e2351</td>\n",
       "      <td>83</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5656</th>\n",
       "      <td>0</td>\n",
       "      <td>味道非常不错，喜欢这里的咖喱，猪颈肉，菠萝饭，木瓜雪蛤，小朋友吃非常好 #猪颈肉# #菠萝海...</td>\n",
       "      <td>5656</td>\n",
       "      <td>82</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1445</th>\n",
       "      <td>-1</td>\n",
       "      <td>店里环境不错，干净舒适，老板人很热情。三个人去的，点的一个单人餐一个双人餐，分量足，砂锅肥肠...</td>\n",
       "      <td>b936480f-cdc5-4e3f-ac71-f39771fa5f5d</td>\n",
       "      <td>82</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3318</th>\n",
       "      <td>0</td>\n",
       "      <td>雨下好大，可是蛋糕店里好温暖。这是第N次团购了，贝尔麦莎确实不错，每一样东东都做得不错。买饮...</td>\n",
       "      <td>3318</td>\n",
       "      <td>82</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3457</th>\n",
       "      <td>0</td>\n",
       "      <td>点了牛排，意大利面，还有海鲜饭，味道都很好，特别是团购的，很划得来，这里环境也很不错，水果自...</td>\n",
       "      <td>3457</td>\n",
       "      <td>82</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5074</th>\n",
       "      <td>1</td>\n",
       "      <td>买的鹅翅小就不说了，关健是变质了，打电话电家说不可能，店面离我家十分钟路程不到，快递小哥送得...</td>\n",
       "      <td>5074</td>\n",
       "      <td>82</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7476</th>\n",
       "      <td>1</td>\n",
       "      <td>第一：商家不看备注第二：偷工减料第三：难吃第四：配送晚了三十几分钟第五：图片实物严重不符第六...</td>\n",
       "      <td>7476</td>\n",
       "      <td>82</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7332</th>\n",
       "      <td>0</td>\n",
       "      <td>菜品新鲜，口味纯正，份量足，两人吃很合适，送了花生浆和西瓜，老公讲，这是火锅团购最好吃的一次...</td>\n",
       "      <td>7332</td>\n",
       "      <td>82</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>363</th>\n",
       "      <td>0</td>\n",
       "      <td>味道还是一如既往的好呀，鱼肉非常的嫩也非常入味，配菜也非常好吃，份量也非常足，我们四个人去吃...</td>\n",
       "      <td>363</td>\n",
       "      <td>82</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8192</th>\n",
       "      <td>0</td>\n",
       "      <td>我想说#黔江鸡杂#这是我吃过最难吃的黔江鸡杂，感觉菜是从水里煮好后捞起来加上调料一样，不入味...</td>\n",
       "      <td>8192</td>\n",
       "      <td>82</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8174</th>\n",
       "      <td>0</td>\n",
       "      <td>店家态度很好 开始说好了两点去拿 结果因为中途有事 改了时间 店家态度很好满分 朋友说蛋糕味...</td>\n",
       "      <td>8174</td>\n",
       "      <td>82</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7749</th>\n",
       "      <td>0</td>\n",
       "      <td>不错不错，挺大的蛋糕，我们十多个人在餐厅没吃完，又提回去当早餐吃了，哈哈哈，味道还可以，不是...</td>\n",
       "      <td>7749</td>\n",
       "      <td>82</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3567</th>\n",
       "      <td>1</td>\n",
       "      <td>食品里面吃出来一根头发，这谁还吃得下去，好好的一顿饭吃成那样，找他们退款，就道个歉，说以后注...</td>\n",
       "      <td>3567</td>\n",
       "      <td>82</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2740</th>\n",
       "      <td>1</td>\n",
       "      <td>我尼玛吓一跳，差点把苍蝇当成蚊子吃了！！这是什么情况，太不卫生了吧，第一次喝粥里面有苍蝇这么...</td>\n",
       "      <td>2740</td>\n",
       "      <td>82</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2655</th>\n",
       "      <td>0</td>\n",
       "      <td>不错挺好的，牛排口味也很好，已经去过多次就餐了～请朋友吃饭；来的也是这里水果沙拉，还有虾，芒...</td>\n",
       "      <td>2655</td>\n",
       "      <td>82</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1510</th>\n",
       "      <td>-1</td>\n",
       "      <td>非常好，就是对我比较喜欢吃肉，牛排多点就好了，但是总体能吃饱，西餐注重的是情调，和我闺蜜来的...</td>\n",
       "      <td>c147a742-0f49-40e5-ab66-d705d9b9c9a4</td>\n",
       "      <td>82</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>325</th>\n",
       "      <td>0</td>\n",
       "      <td>小票顶头那么大几个字泡椒肉丝炒粉，这都看不见，，真是醉了，炒粉无色无味，反正那么多差评了，也...</td>\n",
       "      <td>325</td>\n",
       "      <td>82</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1304</th>\n",
       "      <td>0</td>\n",
       "      <td>饭太不行了，一点都不好吃，比学校最便宜的饭还难吃，有点像是剩饭重新加水煮的，肉很难吃，肉质不...</td>\n",
       "      <td>1304</td>\n",
       "      <td>81</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8025</th>\n",
       "      <td>0</td>\n",
       "      <td>老板越来越不地道了，上次来实体店买的30一斤的进口葡萄，保鲜膜套着买了两斤多拿回来很多都是坏...</td>\n",
       "      <td>8025</td>\n",
       "      <td>81</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9795</th>\n",
       "      <td>0</td>\n",
       "      <td>你们那个送餐的把餐送到了嘴巴不干净的在说我们，没听清楚她在念啥子，啥子态度哟！点你们餐那么多...</td>\n",
       "      <td>9795</td>\n",
       "      <td>81</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>586</th>\n",
       "      <td>1</td>\n",
       "      <td>不卫生 不卫生 不卫生 重要的事情说三遍，以前一直都觉得餐盒干净才订的，今天的咖喱鸡吃到一半...</td>\n",
       "      <td>586</td>\n",
       "      <td>81</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7296</th>\n",
       "      <td>0</td>\n",
       "      <td>我们十个人吃的，没吃完，浪费了一些，味道挺好的。上午把验证码给工作人员验证了，晚上直接去吃的...</td>\n",
       "      <td>7296</td>\n",
       "      <td>81</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1233</th>\n",
       "      <td>0</td>\n",
       "      <td>#意式提拉米苏#等了2个小时，结果失望透顶。从来没吃过这么难吃的提拉米苏，还打翻了，直接丢了...</td>\n",
       "      <td>1233</td>\n",
       "      <td>81</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1000 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      label                                            comment  \\\n",
       "1656      0  老顾客了，性价比超高。                       夏天生意也一样好，是因为...   \n",
       "8611      0  串串我吃得多了，但是这家“嘿火”必须要重点给大家推荐一下，首先从此家店的服务说起，店虽然不大...   \n",
       "3565      1  第一次买，首先说配送员，打了三个电话，找不到路，你找不到路是我的错？地址写得清清楚楚你给我说...   \n",
       "6422      0  #酸菜肉丝米线二两#    简单说一下我的感受: 第一送外卖的叔叔把米线送到南北校区马路那里...   \n",
       "759       0  花了58元，也就是三顿饭的钱，买了虾仁酥和皇帝鱼。感觉这58元，完全就是浪费了。虾仁酥其实就...   \n",
       "6350      1  在他们家买了两瓶可口可乐，快喝完一瓶才发现，生产日期是2016年10月24日，保质期九个月，...   \n",
       "4948      0  服务、菜品、分量、环境都非常的好。首先是服务态度，老板和服务员人特别好的好，给我们介绍餐厅，...   \n",
       "2965      1  哎 看了老板的介绍 说是不好做 也看到了评论中的一些问题…我以为老板是可以信赖的人 所以还是...   \n",
       "346      -1  我从来没吃过这么奇怪又难吃的烧烤！！点了个玉米，点了点素菜，其中包括青椒，结果商家直接给我送...   \n",
       "5113      0  本来想去吃小龙虾来着，无意间看见这家烤肉店，看了评论蛮好，又看见照片评论这么多肉，忍不住立马...   \n",
       "5178      1  送来卤鸡蛋是臭的。联系老板，老板说再送一个过来，或者是退我两块钱。这个怎么退钱嘛，我就让她换...   \n",
       "5334      0  首先羊肉味道不错，个人觉得偏辣，所以第二只马上通知厨师别放这么多辣椒，就觉得非常合适，第一只...   \n",
       "3655      0  服务不怎么好，我们加了十块钱想把鲜排改为腊排，但是都动了筷子了才发现给我们换成了辣猪蹄，叫来...   \n",
       "156      -1  真的要认真点评下。因为想吃日料，和老公在两家不同的店点了餐，想考察下住家附近有哪家店的好吃。...   \n",
       "5384      1  先说说牛肉吧，被我从来不吃泰国菜的老公吃完了，因为跟中国的回锅肉味道一样一样的，虽然吃起来像...   \n",
       "2809      1  首先我想说我是你们家老顾客了。最起码这几天都是吃你们家饭菜。但是今天真的很让我失望。我中午点...   \n",
       "2829      0  提前半小时就打电话预订了，于是到了餐馆老板就直接给我端上来了。十分的方便。所以要过去的朋友们...   \n",
       "1520      1  我们是老顾客 老板心知肚明吧 。第一次吃到饭里有老鼠*、你说是外卖骑手故意弄的，我们理解了并...   \n",
       "7302      0  位置很好找，过了附近的路口很清楚的就可以看见牌匾，味道太棒了，红油锅非常好吃，不知道哪里透露...   \n",
       "5598      0  来了很多次了 很棒的一家店 #寿司# #海虾芒果卷# #肥牛卷# #日式料理# #炸虾天妇罗...   \n",
       "1101      0  特意飞来重庆就为吃一顿正宗的重庆火锅，吃完后总体感觉还是不错的，食材都很新鲜，味道也棒棒哒。...   \n",
       "3019      1  请各位朋友看过来，我从来没发过评论，但今天真是忍不住了，请各位千万不要再点这个饭，送餐费收了...   \n",
       "124       0  还可以，糊辣壳很有特色，味道也很好，唯一美中不足的是面条，我比较喜欢吃面，南川的面店基本上都...   \n",
       "286      -1  1环境；老板贴心在地址里面说得很清楚，走到里面二号厅电梯（排队进电梯……人太多了……）上楼即...   \n",
       "170       1  这家店原来这么不卫生，吃到3盒饭都是小强腿，苍蝇，可惜忘了拍，最郁闷的是整只小强，吃得我们吐...   \n",
       "8440      0  #榴莲披萨#吃过的最难吃的披萨没有之一，如果不会做披萨可以不用买这个，就单纯卖薯条炸鸡就够了...   \n",
       "2940      0  位置：位于渝中区石油路时代天街A馆3号门对面支路内80米（PS:这是官方说法哈~）实际上 官...   \n",
       "9703      0  这样的商家真是第一次见到，订了那么多次外卖，真是够奇葩，顾客不是人人都那么细心，我没看见饭要...   \n",
       "7597      0  团购的服务员对我们也很热情并没有因为是团购就不管我们，这里的服务员非常的有礼貌，并且还有自助...   \n",
       "1251      0  经常团购美食，吃后很少发帖，刚刚吃了这家的铁板烧，必须赞一个。一是货真价实：平底铁锅上桌，服...   \n",
       "...     ...                                                ...   \n",
       "1700     -1  这是回锅肉？请告诉我这个回锅肉？看着都没食欲了，那个送外卖的是个傻子吧，说了那么明显的路线都...   \n",
       "5547      0  今天和闺蜜一起过七夕，我们下午在这里呆了三个小时左右，很开心，这里的猫咪超级萌萌哒，抱着它们...   \n",
       "1328     -1  比以前订的虾差得太远太远了，总共二十来个虾，烂虾、死虾多，接近一半差虾钳，更甚者还有几个虾头...   \n",
       "1700      0  超级棒的一家酒吧，环境跟服务都非常到位，驻唱歌手水准很高！设备也很给力，音乐节奏感十足。气氛...   \n",
       "9531      0  预定配送时间50分钟以后才到货，中间催了店家3次才收到，下楼时骑手还不在，把饭随便一摆人就不...   \n",
       "7982      0  请客团了3桌。菜上得快。加了2个菜，加上饮料啤酒，差不多加了200多块一桌。总体价格还是可以...   \n",
       "6761      0  好喜欢的，老板娘人很nice.手把手的教我们怎么做，哪里还可以做饼干之内的小吃，小孩子吵着没...   \n",
       "1698     -1  吃了以后拉了三天肚子了，在医院治疗肠胃炎输水花了800多，肚子疼死了，晚上都睡不好，吃什么吐...   \n",
       "5656      0  味道非常不错，喜欢这里的咖喱，猪颈肉，菠萝饭，木瓜雪蛤，小朋友吃非常好 #猪颈肉# #菠萝海...   \n",
       "1445     -1  店里环境不错，干净舒适，老板人很热情。三个人去的，点的一个单人餐一个双人餐，分量足，砂锅肥肠...   \n",
       "3318      0  雨下好大，可是蛋糕店里好温暖。这是第N次团购了，贝尔麦莎确实不错，每一样东东都做得不错。买饮...   \n",
       "3457      0  点了牛排，意大利面，还有海鲜饭，味道都很好，特别是团购的，很划得来，这里环境也很不错，水果自...   \n",
       "5074      1  买的鹅翅小就不说了，关健是变质了，打电话电家说不可能，店面离我家十分钟路程不到，快递小哥送得...   \n",
       "7476      1  第一：商家不看备注第二：偷工减料第三：难吃第四：配送晚了三十几分钟第五：图片实物严重不符第六...   \n",
       "7332      0  菜品新鲜，口味纯正，份量足，两人吃很合适，送了花生浆和西瓜，老公讲，这是火锅团购最好吃的一次...   \n",
       "363       0  味道还是一如既往的好呀，鱼肉非常的嫩也非常入味，配菜也非常好吃，份量也非常足，我们四个人去吃...   \n",
       "8192      0  我想说#黔江鸡杂#这是我吃过最难吃的黔江鸡杂，感觉菜是从水里煮好后捞起来加上调料一样，不入味...   \n",
       "8174      0  店家态度很好 开始说好了两点去拿 结果因为中途有事 改了时间 店家态度很好满分 朋友说蛋糕味...   \n",
       "7749      0  不错不错，挺大的蛋糕，我们十多个人在餐厅没吃完，又提回去当早餐吃了，哈哈哈，味道还可以，不是...   \n",
       "3567      1  食品里面吃出来一根头发，这谁还吃得下去，好好的一顿饭吃成那样，找他们退款，就道个歉，说以后注...   \n",
       "2740      1  我尼玛吓一跳，差点把苍蝇当成蚊子吃了！！这是什么情况，太不卫生了吧，第一次喝粥里面有苍蝇这么...   \n",
       "2655      0  不错挺好的，牛排口味也很好，已经去过多次就餐了～请朋友吃饭；来的也是这里水果沙拉，还有虾，芒...   \n",
       "1510     -1  非常好，就是对我比较喜欢吃肉，牛排多点就好了，但是总体能吃饱，西餐注重的是情调，和我闺蜜来的...   \n",
       "325       0  小票顶头那么大几个字泡椒肉丝炒粉，这都看不见，，真是醉了，炒粉无色无味，反正那么多差评了，也...   \n",
       "1304      0  饭太不行了，一点都不好吃，比学校最便宜的饭还难吃，有点像是剩饭重新加水煮的，肉很难吃，肉质不...   \n",
       "8025      0  老板越来越不地道了，上次来实体店买的30一斤的进口葡萄，保鲜膜套着买了两斤多拿回来很多都是坏...   \n",
       "9795      0  你们那个送餐的把餐送到了嘴巴不干净的在说我们，没听清楚她在念啥子，啥子态度哟！点你们餐那么多...   \n",
       "586       1  不卫生 不卫生 不卫生 重要的事情说三遍，以前一直都觉得餐盒干净才订的，今天的咖喱鸡吃到一半...   \n",
       "7296      0  我们十个人吃的，没吃完，浪费了一些，味道挺好的。上午把验证码给工作人员验证了，晚上直接去吃的...   \n",
       "1233      0  #意式提拉米苏#等了2个小时，结果失望透顶。从来没吃过这么难吃的提拉米苏，还打翻了，直接丢了...   \n",
       "\n",
       "                                        id  comment_len  \n",
       "1656                                  1656          255  \n",
       "8611                                  8611          255  \n",
       "3565                                  3565          255  \n",
       "6422                                  6422          255  \n",
       "759                                    759          255  \n",
       "6350                                  6350          255  \n",
       "4948                                  4948          255  \n",
       "2965                                  2965          255  \n",
       "346   2d047743-d08e-4eb7-b073-cf5b8f2d10ba          255  \n",
       "5113                                  5113          255  \n",
       "5178                                  5178          255  \n",
       "5334                                  5334          255  \n",
       "3655                                  3655          255  \n",
       "156   15f5142b-da2b-4b14-b593-e3eff088a453          255  \n",
       "5384                                  5384          255  \n",
       "2809                                  2809          255  \n",
       "2829                                  2829          255  \n",
       "1520                                  1520          255  \n",
       "7302                                  7302          255  \n",
       "5598                                  5598          255  \n",
       "1101                                  1101          255  \n",
       "3019                                  3019          255  \n",
       "124                                    124          255  \n",
       "286   2577b3b3-0fe5-43bf-926a-4c520cbe28cf          255  \n",
       "170                                    170          255  \n",
       "8440                                  8440          255  \n",
       "2940                                  2940          255  \n",
       "9703                                  9703          255  \n",
       "7597                                  7597          255  \n",
       "1251                                  1251          255  \n",
       "...                                    ...          ...  \n",
       "1700  d9261d7a-b76a-4f6a-9382-6ec9fc7fbe85           83  \n",
       "5547                                  5547           83  \n",
       "1328  ab09f551-e835-4971-a6d8-319c4bebdc40           83  \n",
       "1700                                  1700           83  \n",
       "9531                                  9531           83  \n",
       "7982                                  7982           83  \n",
       "6761                                  6761           83  \n",
       "1698  d8949880-3577-4956-82d6-9798076e2351           83  \n",
       "5656                                  5656           82  \n",
       "1445  b936480f-cdc5-4e3f-ac71-f39771fa5f5d           82  \n",
       "3318                                  3318           82  \n",
       "3457                                  3457           82  \n",
       "5074                                  5074           82  \n",
       "7476                                  7476           82  \n",
       "7332                                  7332           82  \n",
       "363                                    363           82  \n",
       "8192                                  8192           82  \n",
       "8174                                  8174           82  \n",
       "7749                                  7749           82  \n",
       "3567                                  3567           82  \n",
       "2740                                  2740           82  \n",
       "2655                                  2655           82  \n",
       "1510  c147a742-0f49-40e5-ab66-d705d9b9c9a4           82  \n",
       "325                                    325           82  \n",
       "1304                                  1304           81  \n",
       "8025                                  8025           81  \n",
       "9795                                  9795           81  \n",
       "586                                    586           81  \n",
       "7296                                  7296           81  \n",
       "1233                                  1233           81  \n",
       "\n",
       "[1000 rows x 4 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The 1000 longest comments by character count, longest first.\n",
    "df.sort_values('comment_len', ascending=False).head(1000)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "可以看出，所有评论的最大长度为255，最小长度为5；12000条数据中只有不到1000条的长度超过100。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 类别统计"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    8489\n",
       "1    1511\n",
       "Name: label, dtype: int64"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of training samples per class label.\n",
    "label_counts = train['label'].value_counts()\n",
    "label_counts"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "正负样本比例约为1:5.6（标签为1的正样本1511条，标签为0的负样本8489条），类别明显不平衡。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "pycharm": {
     "is_executing": false
    },
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>comment</th>\n",
       "      <th>id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>1</td>\n",
       "      <td>恶心，不想说了，喝嘴里嚼两下觉得口感不对，吐出来就是只小蟑螂，关键是我还嚼了两口，昨天喝了，...</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>1</td>\n",
       "      <td>真的无语，上两次还觉得可以，今天这个菠萝包里头奶油都臭了，完全不能吃</td>\n",
       "      <td>19</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>1</td>\n",
       "      <td>饭里面居然还有虫，好恶心，不想吃了！差评！</td>\n",
       "      <td>21</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>1</td>\n",
       "      <td>给一个星星❤️完全是给外卖小哥的，为了给我女朋友送外卖都摔了，心疼。但是这个芒果不能忍，女朋...</td>\n",
       "      <td>22</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>1</td>\n",
       "      <td>吃比萨还有虫子，这是什么商家？</td>\n",
       "      <td>23</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44</th>\n",
       "      <td>1</td>\n",
       "      <td>别的味道我先不说   吃了拉肚子才是真的</td>\n",
       "      <td>44</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>1</td>\n",
       "      <td>和室友一起吃了全体拉肚子 今天是第二天又拉了三次</td>\n",
       "      <td>47</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>52</th>\n",
       "      <td>1</td>\n",
       "      <td>#【鲜切】海南香蕉王#吃完反胃想吐，整晚都干呕！#《鲜切》苹果#难吃，又不甜！网购第一次差评...</td>\n",
       "      <td>52</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57</th>\n",
       "      <td>1</td>\n",
       "      <td>卧槽绝对地沟油，油都是黑色的</td>\n",
       "      <td>57</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>70</th>\n",
       "      <td>1</td>\n",
       "      <td>我勒个去  吃到一半吃掉一只蟑螂 刚想吃一下看着不对  太恶心了</td>\n",
       "      <td>70</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>74</th>\n",
       "      <td>1</td>\n",
       "      <td>一点芝士都没有，差，差，差，难吃。配送也慢。一点也不开心。</td>\n",
       "      <td>74</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>93</th>\n",
       "      <td>1</td>\n",
       "      <td>汉堡里面的肉都是生的。而且还有一个汉堡没有，而且薯条量少又难吃，</td>\n",
       "      <td>93</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>98</th>\n",
       "      <td>1</td>\n",
       "      <td>泡芙都是霉的，每一个上面都有霉点！昨天晚上买的，今早上才发现！！我居然吃了四五个，想着都要吐了！！</td>\n",
       "      <td>98</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>103</th>\n",
       "      <td>1</td>\n",
       "      <td>能告诉我为什么米饭是馊的吗</td>\n",
       "      <td>103</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>111</th>\n",
       "      <td>1</td>\n",
       "      <td>不知怎么的，泡椒血旺是臭的！</td>\n",
       "      <td>111</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>113</th>\n",
       "      <td>1</td>\n",
       "      <td>刚才掉地上了，说的重新送，是回收的吧，还有一块石头，太垃圾了，等了这么久，摔了我又没有怪你们...</td>\n",
       "      <td>113</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>123</th>\n",
       "      <td>1</td>\n",
       "      <td>饭是馊的，怎么看都不想香辣鸡</td>\n",
       "      <td>123</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>128</th>\n",
       "      <td>1</td>\n",
       "      <td>花甲已经臭掉了、汤汁还有一股馊味</td>\n",
       "      <td>128</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>140</th>\n",
       "      <td>1</td>\n",
       "      <td>太垃圾了  东西还有臭的</td>\n",
       "      <td>140</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>141</th>\n",
       "      <td>1</td>\n",
       "      <td>饭里有钢丝。第二次了。第一次没说</td>\n",
       "      <td>141</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>142</th>\n",
       "      <td>1</td>\n",
       "      <td>老 子 吃到一个钢丝球 没注意 吞下去了 还有一个铁丝 从嘴里抠了出来 弄得我喉咙都发炎了 ...</td>\n",
       "      <td>142</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>152</th>\n",
       "      <td>1</td>\n",
       "      <td>一打开就看到虫子 恶心得一天都吃不下饭...</td>\n",
       "      <td>152</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>156</th>\n",
       "      <td>1</td>\n",
       "      <td>没有筷子，而且感觉一点都不新鲜，昨天晚上吃了，现在竟然拉肚子</td>\n",
       "      <td>156</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>157</th>\n",
       "      <td>1</td>\n",
       "      <td>吃到一半发现中间有根钢丝球，还拍了照片</td>\n",
       "      <td>157</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>160</th>\n",
       "      <td>1</td>\n",
       "      <td>凉皮有味道了，吃完一天肚子都不舒服，拉肚子，不是第一次在这家吃了，希望有所改进</td>\n",
       "      <td>160</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>163</th>\n",
       "      <td>1</td>\n",
       "      <td>臭臭的，很难吃。各位小主不要点。</td>\n",
       "      <td>163</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>169</th>\n",
       "      <td>1</td>\n",
       "      <td>吃到一半突然有只虫！</td>\n",
       "      <td>169</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>170</th>\n",
       "      <td>1</td>\n",
       "      <td>这家店原来这么不卫生，吃到3盒饭都是小强腿，苍蝇，可惜忘了拍，最郁闷的是整只小强，吃得我们吐...</td>\n",
       "      <td>170</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>176</th>\n",
       "      <td>1</td>\n",
       "      <td>难吃，而且一点不新鲜，吃完后拉肚子</td>\n",
       "      <td>176</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>182</th>\n",
       "      <td>1</td>\n",
       "      <td>shen#秘制卤花甲##烤带子##烤扇贝王#花甲里面暴多的沙子，2人各吃一个就没吃下去的心情...</td>\n",
       "      <td>182</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9863</th>\n",
       "      <td>1</td>\n",
       "      <td>什么鬼，鸡翅是臭的，不知道冻了多久了</td>\n",
       "      <td>9863</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9869</th>\n",
       "      <td>1</td>\n",
       "      <td>看到评论说很好吃，我就来了一份，结果，肉丝，是一大坨，而不是丝，更可恶的是，还吃了两条虫出来...</td>\n",
       "      <td>9869</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9872</th>\n",
       "      <td>1</td>\n",
       "      <td>米饭和剩米一样，土豆也好多黑的，感觉吃剩菜一样</td>\n",
       "      <td>9872</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9874</th>\n",
       "      <td>1</td>\n",
       "      <td>饭菜一点都不新鲜，泡椒鸡杂有股说不出来的馊味，回锅肉也是，也不知道吃完会不会食物中毒，差评</td>\n",
       "      <td>9874</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9885</th>\n",
       "      <td>1</td>\n",
       "      <td>买了这么多次，这次的虾子最差，以前几只死的就算了，今天就三只新鲜虾子，其他的都有气味了，</td>\n",
       "      <td>9885</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9893</th>\n",
       "      <td>1</td>\n",
       "      <td>土豆丝里面有小强！！！怎么吃啊！</td>\n",
       "      <td>9893</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9894</th>\n",
       "      <td>1</td>\n",
       "      <td>吃饭吃到钢丝你能信？</td>\n",
       "      <td>9894</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9897</th>\n",
       "      <td>1</td>\n",
       "      <td>吃了直接拉肚子了</td>\n",
       "      <td>9897</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9903</th>\n",
       "      <td>1</td>\n",
       "      <td>番茄丸子汤变味了，室友都说像是馊的，不敢再喝了</td>\n",
       "      <td>9903</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9905</th>\n",
       "      <td>1</td>\n",
       "      <td>豆浆很好喝，为了豆浆点你们家。这个皮蛋瘦肉粥！我简直无法下咽……皮蛋臭臭的就算了，粥是用什么...</td>\n",
       "      <td>9905</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9907</th>\n",
       "      <td>1</td>\n",
       "      <td>西瓜很不新鲜 香蕉也是送的发黑的烂的。送到时袋子里都是西瓜水。</td>\n",
       "      <td>9907</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9916</th>\n",
       "      <td>1</td>\n",
       "      <td>吃了的人全都拉肚子，不想多说集体在医院打点滴</td>\n",
       "      <td>9916</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9923</th>\n",
       "      <td>1</td>\n",
       "      <td>很难吃，肉都是臭的。放了这么多辣椒都盖不住臭味，你们到底是放了多久的肉！！！这么臭的肉都舍不...</td>\n",
       "      <td>9923</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9927</th>\n",
       "      <td>1</td>\n",
       "      <td>汤是馊的，菜咸的简直了</td>\n",
       "      <td>9927</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9933</th>\n",
       "      <td>1</td>\n",
       "      <td>吃了啦了一天的肚子，而且牛蛙很不干净，啦脱水了。</td>\n",
       "      <td>9933</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9937</th>\n",
       "      <td>1</td>\n",
       "      <td>1根本不是五花肉小串，街边那种红肉串；2，肥肠不是很熟，导致臭味太浓</td>\n",
       "      <td>9937</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9939</th>\n",
       "      <td>1</td>\n",
       "      <td>头发头发！注意点卫生可好！</td>\n",
       "      <td>9939</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9954</th>\n",
       "      <td>1</td>\n",
       "      <td>巨难吃，点的微辣，一点辣椒都没有。肉有异味</td>\n",
       "      <td>9954</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9956</th>\n",
       "      <td>1</td>\n",
       "      <td>猪血臭了，害得一碗都有味道，吃都没吃</td>\n",
       "      <td>9956</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9958</th>\n",
       "      <td>1</td>\n",
       "      <td>肉都是臭的！吃了拉肚子，气死人了</td>\n",
       "      <td>9958</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9961</th>\n",
       "      <td>1</td>\n",
       "      <td>太不干净了。吃到一半居然有块布在里面。瞬间都吃不下了</td>\n",
       "      <td>9961</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9962</th>\n",
       "      <td>1</td>\n",
       "      <td>猪脚都是臭的。</td>\n",
       "      <td>9962</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9964</th>\n",
       "      <td>1</td>\n",
       "      <td>果盘里有虫子，特别明显，不能因为我是买的外卖就这么坑人吧</td>\n",
       "      <td>9964</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9966</th>\n",
       "      <td>1</td>\n",
       "      <td>滑肉变味了，有点臭</td>\n",
       "      <td>9966</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9980</th>\n",
       "      <td>1</td>\n",
       "      <td>重来不做差评的，第一次，首先味道很难吃，有史以来吃过最难吃的烧烤，其次菜品有问题，吃了一点拉...</td>\n",
       "      <td>9980</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9986</th>\n",
       "      <td>1</td>\n",
       "      <td>包装不错，可是有股味儿感觉放了好久，吃了就拉肚子了</td>\n",
       "      <td>9986</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9989</th>\n",
       "      <td>1</td>\n",
       "      <td>气死我了 味道是很好吃 但是老板你根本烤都没烤熟 还没吃完就一直拉肚子 从昨晚拉到现在 再怎...</td>\n",
       "      <td>9989</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9991</th>\n",
       "      <td>1</td>\n",
       "      <td>第一次吃你们家三文鱼的时候上桌时连冰都没退，这次点外卖呢，三文鱼就跟被风干过一样…牛逼</td>\n",
       "      <td>9991</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9993</th>\n",
       "      <td>1</td>\n",
       "      <td>我和老公吃了都拉肚子了，人均5次，真的伤不起(&gt;﹏&lt;)</td>\n",
       "      <td>9993</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9995</th>\n",
       "      <td>1</td>\n",
       "      <td>有小蟑螂，太不卫生了</td>\n",
       "      <td>9995</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1511 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      label                                            comment    id\n",
       "12        1  恶心，不想说了，喝嘴里嚼两下觉得口感不对，吐出来就是只小蟑螂，关键是我还嚼了两口，昨天喝了，...    12\n",
       "19        1                 真的无语，上两次还觉得可以，今天这个菠萝包里头奶油都臭了，完全不能吃    19\n",
       "21        1                              饭里面居然还有虫，好恶心，不想吃了！差评！    21\n",
       "22        1  给一个星星❤️完全是给外卖小哥的，为了给我女朋友送外卖都摔了，心疼。但是这个芒果不能忍，女朋...    22\n",
       "23        1                                    吃比萨还有虫子，这是什么商家？    23\n",
       "44        1                               别的味道我先不说   吃了拉肚子才是真的    44\n",
       "47        1                           和室友一起吃了全体拉肚子 今天是第二天又拉了三次    47\n",
       "52        1  #【鲜切】海南香蕉王#吃完反胃想吐，整晚都干呕！#《鲜切》苹果#难吃，又不甜！网购第一次差评...    52\n",
       "57        1                                     卧槽绝对地沟油，油都是黑色的    57\n",
       "70        1                   我勒个去  吃到一半吃掉一只蟑螂 刚想吃一下看着不对  太恶心了    70\n",
       "74        1                      一点芝士都没有，差，差，差，难吃。配送也慢。一点也不开心。    74\n",
       "93        1                   汉堡里面的肉都是生的。而且还有一个汉堡没有，而且薯条量少又难吃，    93\n",
       "98        1  泡芙都是霉的，每一个上面都有霉点！昨天晚上买的，今早上才发现！！我居然吃了四五个，想着都要吐了！！    98\n",
       "103       1                                      能告诉我为什么米饭是馊的吗   103\n",
       "111       1                                     不知怎么的，泡椒血旺是臭的！   111\n",
       "113       1  刚才掉地上了，说的重新送，是回收的吧，还有一块石头，太垃圾了，等了这么久，摔了我又没有怪你们...   113\n",
       "123       1                                     饭是馊的，怎么看都不想香辣鸡   123\n",
       "128       1                                   花甲已经臭掉了、汤汁还有一股馊味   128\n",
       "140       1                                       太垃圾了  东西还有臭的   140\n",
       "141       1                                   饭里有钢丝。第二次了。第一次没说   141\n",
       "142       1  老 子 吃到一个钢丝球 没注意 吞下去了 还有一个铁丝 从嘴里抠了出来 弄得我喉咙都发炎了 ...   142\n",
       "152       1                             一打开就看到虫子 恶心得一天都吃不下饭...   152\n",
       "156       1                     没有筷子，而且感觉一点都不新鲜，昨天晚上吃了，现在竟然拉肚子   156\n",
       "157       1                                吃到一半发现中间有根钢丝球，还拍了照片   157\n",
       "160       1            凉皮有味道了，吃完一天肚子都不舒服，拉肚子，不是第一次在这家吃了，希望有所改进   160\n",
       "163       1                                   臭臭的，很难吃。各位小主不要点。   163\n",
       "169       1                                         吃到一半突然有只虫！   169\n",
       "170       1  这家店原来这么不卫生，吃到3盒饭都是小强腿，苍蝇，可惜忘了拍，最郁闷的是整只小强，吃得我们吐...   170\n",
       "176       1                                  难吃，而且一点不新鲜，吃完后拉肚子   176\n",
       "182       1  shen#秘制卤花甲##烤带子##烤扇贝王#花甲里面暴多的沙子，2人各吃一个就没吃下去的心情...   182\n",
       "...     ...                                                ...   ...\n",
       "9863      1                                 什么鬼，鸡翅是臭的，不知道冻了多久了  9863\n",
       "9869      1  看到评论说很好吃，我就来了一份，结果，肉丝，是一大坨，而不是丝，更可恶的是，还吃了两条虫出来...  9869\n",
       "9872      1                            米饭和剩米一样，土豆也好多黑的，感觉吃剩菜一样  9872\n",
       "9874      1      饭菜一点都不新鲜，泡椒鸡杂有股说不出来的馊味，回锅肉也是，也不知道吃完会不会食物中毒，差评  9874\n",
       "9885      1       买了这么多次，这次的虾子最差，以前几只死的就算了，今天就三只新鲜虾子，其他的都有气味了，  9885\n",
       "9893      1                                   土豆丝里面有小强！！！怎么吃啊！  9893\n",
       "9894      1                                         吃饭吃到钢丝你能信？  9894\n",
       "9897      1                                           吃了直接拉肚子了  9897\n",
       "9903      1                            番茄丸子汤变味了，室友都说像是馊的，不敢再喝了  9903\n",
       "9905      1  豆浆很好喝，为了豆浆点你们家。这个皮蛋瘦肉粥！我简直无法下咽……皮蛋臭臭的就算了，粥是用什么...  9905\n",
       "9907      1                   西瓜很不新鲜 香蕉也是送的发黑的烂的。送到时袋子里都是西瓜水。   9907\n",
       "9916      1                             吃了的人全都拉肚子，不想多说集体在医院打点滴  9916\n",
       "9923      1  很难吃，肉都是臭的。放了这么多辣椒都盖不住臭味，你们到底是放了多久的肉！！！这么臭的肉都舍不...  9923\n",
       "9927      1                                        汤是馊的，菜咸的简直了  9927\n",
       "9933      1                           吃了啦了一天的肚子，而且牛蛙很不干净，啦脱水了。  9933\n",
       "9937      1                 1根本不是五花肉小串，街边那种红肉串；2，肥肠不是很熟，导致臭味太浓  9937\n",
       "9939      1                                      头发头发！注意点卫生可好！  9939\n",
       "9954      1                              巨难吃，点的微辣，一点辣椒都没有。肉有异味  9954\n",
       "9956      1                                 猪血臭了，害得一碗都有味道，吃都没吃  9956\n",
       "9958      1                                   肉都是臭的！吃了拉肚子，气死人了  9958\n",
       "9961      1                         太不干净了。吃到一半居然有块布在里面。瞬间都吃不下了  9961\n",
       "9962      1                                            猪脚都是臭的。  9962\n",
       "9964      1                       果盘里有虫子，特别明显，不能因为我是买的外卖就这么坑人吧  9964\n",
       "9966      1                                          滑肉变味了，有点臭  9966\n",
       "9980      1  重来不做差评的，第一次，首先味道很难吃，有史以来吃过最难吃的烧烤，其次菜品有问题，吃了一点拉...  9980\n",
       "9986      1                          包装不错，可是有股味儿感觉放了好久，吃了就拉肚子了  9986\n",
       "9989      1  气死我了 味道是很好吃 但是老板你根本烤都没烤熟 还没吃完就一直拉肚子 从昨晚拉到现在 再怎...  9989\n",
       "9991      1        第一次吃你们家三文鱼的时候上桌时连冰都没退，这次点外卖呢，三文鱼就跟被风干过一样…牛逼  9991\n",
       "9993      1                        我和老公吃了都拉肚子了，人均5次，真的伤不起(>﹏<)  9993\n",
       "9995      1                                         有小蟑螂，太不卫生了  9995\n",
       "\n",
       "[1511 rows x 3 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train[train.label==1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache C:\\Users\\MLoong\\AppData\\Local\\Temp\\jieba.cache\n",
      "Loading model cost 0.589 seconds.\n",
      "Prefix dict has been built succesfully.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[('，', 3521),\n",
       " ('了', 2043),\n",
       " ('的', 1791),\n",
       " (' ', 1272),\n",
       " ('吃', 992),\n",
       " ('！', 929),\n",
       " ('。', 914),\n",
       " ('是', 881),\n",
       " ('不', 674),\n",
       " ('都', 665),\n",
       " ('我', 617),\n",
       " ('有', 375),\n",
       " ('也', 343),\n",
       " ('还', 336),\n",
       " ('就', 307),\n",
       " ('#', 303),\n",
       " ('？', 268),\n",
       " ('拉肚子', 263),\n",
       " ('臭', 259),\n",
       " ('味道', 240),\n",
       " ('说', 223),\n",
       " ('里面', 219),\n",
       " ('很', 201),\n",
       " ('点', 181),\n",
       " ('没', 180),\n",
       " ('…', 180),\n",
       " ('在', 179),\n",
       " ('难吃', 175),\n",
       " ('到', 174),\n",
       " ('给', 173),\n",
       " ('没有', 173),\n",
       " ('肉', 171),\n",
       " ('你', 159),\n",
       " ('还有', 149),\n",
       " ('新鲜', 145),\n",
       " ('头发', 141),\n",
       " ('太', 134),\n",
       " ('恶心', 132),\n",
       " ('好', 127),\n",
       " ('真的', 126),\n",
       " ('差评', 125),\n",
       " ('一个', 125),\n",
       " ('卫生', 124),\n",
       " ('知道', 120),\n",
       " ('买', 119),\n",
       " ('你们', 115),\n",
       " ('外卖', 114),\n",
       " ('饭', 110),\n",
       " ('还是', 109),\n",
       " ('居然', 108),\n",
       " ('这', 107),\n",
       " ('馊', 105),\n",
       " ('菜', 103),\n",
       " ('虫', 97),\n",
       " ('什么', 96),\n",
       " ('今天', 95),\n",
       " ('这个', 94),\n",
       " ('第一次', 93),\n",
       " ('东西', 93),\n",
       " ('吗', 92),\n",
       " ('商家', 91),\n",
       " ('人', 90),\n",
       " ('就是', 89),\n",
       " ('和', 89),\n",
       " ('这么', 88),\n",
       " ('怎么', 87),\n",
       " ('不是', 87),\n",
       " ('好吃', 84),\n",
       " ('可以', 81),\n",
       " ('但是', 81),\n",
       " ('而且', 80),\n",
       " ('一股', 76),\n",
       " ('感觉', 76),\n",
       " ('吧', 75),\n",
       " ('&', 74),\n",
       " ('nbsp', 74),\n",
       " (';', 74),\n",
       " ('完', 73),\n",
       " ('不会', 73),\n",
       " ('又', 72),\n",
       " ('##', 71),\n",
       " ('一点', 70),\n",
       " ('个', 68),\n",
       " ('里', 68),\n",
       " ('送', 67),\n",
       " ('发现', 67),\n",
       " ('得', 67),\n",
       " ('想', 65),\n",
       " ('虫子', 64),\n",
       " ('这样', 64),\n",
       " ('出来', 64),\n",
       " ('死', 64),\n",
       " ('要', 63),\n",
       " ('坏', 63),\n",
       " ('吐', 62),\n",
       " ('这次', 62),\n",
       " ('多', 61),\n",
       " ('才', 61),\n",
       " ('差', 61),\n",
       " ('老板', 60)]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import jieba\n",
    "from collections import Counter\n",
    "word_list=[]\n",
    "for cmt in train[train.label==1].comment:\n",
    "    word_list.extend([w for w in jieba.cut(cmt)])\n",
    "vocab=Counter(word_list)\n",
    "vocab.most_common(100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>comment_len</th>\n",
       "      <th>word_len</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>12000.000000</td>\n",
       "      <td>12000.000000</td>\n",
       "      <td>12000.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>-0.040750</td>\n",
       "      <td>33.944667</td>\n",
       "      <td>22.076083</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>0.539395</td>\n",
       "      <td>32.964640</td>\n",
       "      <td>21.415742</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>-1.000000</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>2.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>16.000000</td>\n",
       "      <td>10.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>22.000000</td>\n",
       "      <td>14.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>38.000000</td>\n",
       "      <td>25.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>255.000000</td>\n",
       "      <td>179.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              label   comment_len      word_len\n",
       "count  12000.000000  12000.000000  12000.000000\n",
       "mean      -0.040750     33.944667     22.076083\n",
       "std        0.539395     32.964640     21.415742\n",
       "min       -1.000000      5.000000      2.000000\n",
       "25%        0.000000     16.000000     10.000000\n",
       "50%        0.000000     22.000000     14.000000\n",
       "75%        0.000000     38.000000     25.000000\n",
       "max        1.000000    255.000000    179.000000"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import jieba\n",
    "df['token_comment']=df['comment'].apply(lambda x:\" \".join(jieba.cut(x)))\n",
    "df['word_len']=df['token_comment'].apply(lambda x:len(x.split()))\n",
    "df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
